The goal of our project is to predict Y. edit, include key findings
Target variable:
Motivation: edit here re relevance, cite some literature
Relevant features: edit
Methods: edit
We use the New York Police Department Stop, Question and Frisk Data from 2023.
Each observation of this dataset is a unique stop made by an NYPD police officer in 2023 as part of the SQF programme.
How data is collected & what universe is/is not observed
Temporal and spatial span.
The data has 16971 observations and 82 features, as shown below.
We begin the project by installing and loading in the necessary libraries.
# Install and load required packages.
# pacman is bootstrapped first so p_load() can install/attach everything else.
if (!require("pacman")) {
  install.packages("pacman")
}
library(pacman)
p_load(readxl, dplyr, ggplot2, knitr, lubridate, tidyr, sf, httr, caret, glmnet, stringr, remotes, RColorBrewer, viridis)
We proceed by setting up the file path and importing the data.
# Download the 2023 SQF workbook from the project GitHub repo.
# GET() fetches the raw bytes; we only proceed if the HTTP status is 200.
response <- GET("https://raw.githubusercontent.com/rrachelkane/data-science-group-project/main/data/sqf-2023.xlsx")
# retrieve the .xlsx file
if (status_code(response) == 200) {
# create a temporary file to save the downloaded content
# (read_xlsx cannot read from a raw vector, so we go through disk)
temp_file <- tempfile(fileext = ".xlsx")
# Write the raw content to the temporary file
writeBin(content(response, "raw"), temp_file)
# Read the Excel file from the temporary file
# NOTE(review): the temp file is left for R to clean up at session end
sqf_data <- read_xlsx(temp_file)
# View the first few rows of the data
head(sqf_data)
} else {
stop("Failed to download the file.")
}
## # A tibble: 6 Ć 82
## STOP_ID STOP_FRISK_DATE STOP_FRISK_TIME YEAR2 MONTH2 DAY2 STOP_WAS_INITIATED
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 2023-01-01 00:44:00 2023 January Sund⦠Based on Radio Run
## 2 2 2023-01-01 00:49:00 2023 January Sund⦠Based on Self Iniā¦
## 3 3 2023-01-01 05:31:00 2023 January Sund⦠Based on Radio Run
## 4 4 2023-01-01 04:59:00 2023 January Sund⦠Based on Self Iniā¦
## 5 5 2023-01-01 05:21:00 2023 January Sund⦠Based on Self Iniā¦
## 6 6 2023-01-01 18:00:00 2023 January Sund⦠Based on Radio Run
## # ā¹ 75 more variables: RECORD_STATUS_CODE <chr>, ISSUING_OFFICER_RANK <chr>,
## # ISSUING_OFFICER_COMMAND_CODE <chr>, SUPERVISING_OFFICER_RANK <chr>,
## # SUPERVISING_OFFICER_COMMAND_CODE <chr>,
## # SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED <chr>,
## # LOCATION_IN_OUT_CODE <chr>, JURISDICTION_CODE <chr>,
## # JURISDICTION_DESCRIPTION <chr>, OBSERVED_DURATION_MINUTES <dbl>,
## # SUSPECTED_CRIME_DESCRIPTION <chr>, STOP_DURATION_MINUTES <dbl>, ā¦
# check original dimensions (rows x columns) of the raw import
dim(sqf_data)
## [1] 16971 82
# view head to inspect column names and types before cleaning
head(sqf_data)
## # A tibble: 6 Ć 82
## STOP_ID STOP_FRISK_DATE STOP_FRISK_TIME YEAR2 MONTH2 DAY2 STOP_WAS_INITIATED
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 1 2023-01-01 00:44:00 2023 January Sund⦠Based on Radio Run
## 2 2 2023-01-01 00:49:00 2023 January Sund⦠Based on Self Iniā¦
## 3 3 2023-01-01 05:31:00 2023 January Sund⦠Based on Radio Run
## 4 4 2023-01-01 04:59:00 2023 January Sund⦠Based on Self Iniā¦
## 5 5 2023-01-01 05:21:00 2023 January Sund⦠Based on Self Iniā¦
## 6 6 2023-01-01 18:00:00 2023 January Sund⦠Based on Radio Run
## # ā¹ 75 more variables: RECORD_STATUS_CODE <chr>, ISSUING_OFFICER_RANK <chr>,
## # ISSUING_OFFICER_COMMAND_CODE <chr>, SUPERVISING_OFFICER_RANK <chr>,
## # SUPERVISING_OFFICER_COMMAND_CODE <chr>,
## # SUPERVISING_ACTION_CORRESPONDING_ACTIVITY_LOG_ENTRY_REVIEWED <chr>,
## # LOCATION_IN_OUT_CODE <chr>, JURISDICTION_CODE <chr>,
## # JURISDICTION_DESCRIPTION <chr>, OBSERVED_DURATION_MINUTES <dbl>,
## # SUSPECTED_CRIME_DESCRIPTION <chr>, STOP_DURATION_MINUTES <dbl>, ā¦
First, we change column names from strictly upper case to strictly lower case, because it's cuter.
# convert every column name to lower case
names(sqf_data) <- tolower(names(sqf_data))
# sanity-check the first few names
names(sqf_data)[1:3]
## [1] "stop_id" "stop_frisk_date" "stop_frisk_time"
Next, we drop all columns which cannot be used for our prediction question, as they are realized after the outcome of interest, namely X, or are irrelevant for other reasons. We drop spatial features other than police precinct of the stop and x and y coordinates of the stop.
# columns realized after the outcome, or otherwise unusable for prediction
cols_to_drop <- c(
  "stop_frisk_date", "record_status_code",
  "supervising_action_corresponding_activity_log_entry_reviewed",
  "stop_location_sector_code", "stop_location_apartment",
  "stop_location_full_address", "stop_location_patrol_boro_name",
  "stop_location_street_name", "suspect_other_description"
)
sqf_data <- sqf_data %>%
  select(-all_of(cols_to_drop))
# officer not explained stop description
# this needs to be edited depending on what we choose for y, might drop observed_duration_minutes, top_duration, firearm etc flags, physical_force_ etc flags, suspects_actions (unclear when it occurs)
# check new dim
dim(sqf_data)
## [1] 16971 73
First, we note that there is only 1 column with any instance of an
NA value.
# percentage of NA values per column; show only columns with any missingness
pct_na_by_col <- colMeans(is.na(sqf_data)) * 100
pct_na_by_col[pct_na_by_col > 0]
## demeanor_of_person_stopped
## 15.01385
The process generating the missingness of
demeanor_of_person_stopped is unclear. Imputation of this
would be difficult, so we drop this column.
# drop: the missingness mechanism is unclear, so imputation is not attempted
sqf_data <- sqf_data %>%
  select(-demeanor_of_person_stopped)
# confirm one fewer column
dim(sqf_data)
## [1] 16971 72
Additionally, there are many observations in the data with values equal to
"(null)" across different columns.
# get % of "(null)" entries per column, in columns with at least one null.
# fix: the original computed colMeans(sqf_data == "(null)") twice; compute once.
pct_null <- colMeans(sqf_data == "(null)") * 100
null_cols <- pct_null[pct_null > 0]
# make df for plot
null_cols_df <- data.frame(Feature = names(null_cols), Percentage = null_cols)
dim(null_cols_df)
## [1] 48 2
# order features by percentage so the flipped bars read smallest-to-largest
null_cols_df$Feature <- factor(
  null_cols_df$Feature,
  levels = null_cols_df$Feature[order(null_cols_df$Percentage)]
)
# horizontal bar chart of the (null) share in each affected column
ggplot(null_cols_df, aes(x = Feature, y = Percentage)) +
  geom_col(fill = "lightblue", color = "darkblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Percentage of (null) Values per Column",
       x = "Columns",
       y = "Percentage of (null) Values")
Note, however, that not all of these (null) observations are equivalent.
For some columns, (null) means the data are genuinely missing (effectively NA),
since both "Y" and "N" appear (for binary variables, for example) alongside (null):
sqf_data %>%
group_by(ask_for_consent_flg) %>%
summarise(N = n()) %>%
kable()
| ask_for_consent_flg | N |
|---|---|
| (null) | 779 |
| N | 14187 |
| Y | 2005 |
For other columns, the (null) values are actually used to mean "N":
print(unique(sqf_data$firearm_flag))
## [1] "(null)" "Y"
sqf_data %>%
group_by(weapon_found_flag, firearm_flag) %>%
summarise(N = n()) %>%
kable()
## `summarise()` has grouped output by 'weapon_found_flag'. You can override using
## the `.groups` argument.
| weapon_found_flag | firearm_flag | N |
|---|---|---|
| N | (null) | 14310 |
| N | Y | 28 |
| Y | (null) | 1495 |
| Y | Y | 1138 |
Note here that even though a firearm_flag has a
"Y" entry and weapon_found_flag has a
"N" entry, this is not necessarily incorrect, as the
officer may have identified a firearm without carrying out either a
frisk or a search.
We deal with these cases of (null) separately:
First, we handle columns where (null) is used in place of "N" values:
# identify columns coded only as "Y" or "(null)" -- for these, (null) encodes "N"
is_y_or_null <- vapply(
  sqf_data,
  function(column) {
    vals <- unique(column)
    length(vals) == 2 && all(vals %in% c("Y", "(null)"))
  },
  logical(1)
)
null_2 <- names(sqf_data)[is_y_or_null]
# check n of type 2 nulls
length(null_2)
## [1] 27
# pre-clean check
print(unique(sqf_data$firearm_flag))
## [1] "(null)" "Y"
# recode these pseudo-missing "(null)" entries as an explicit "N"
sqf_data <- sqf_data %>%
  mutate(across(all_of(null_2), ~ replace(.x, .x == "(null)", "N")))
# post-clean check: firearm_flag should now contain only "N" and "Y"
print(unique(sqf_data$firearm_flag))
## [1] "N" "Y"
Second, we recode the remaining (null) values as genuine NA values:
# remaining columns (outside null_2) that still contain "(null)" --
# for these, (null) represents genuinely missing data
contains_null <- vapply(
  sqf_data,
  function(column) "(null)" %in% column,
  logical(1)
)
null_1 <- setdiff(names(sqf_data)[contains_null], null_2)
# check length
length(null_1)
## [1] 21
# pre-clean check
print(unique(sqf_data$ask_for_consent_flg))
## [1] "N" "Y" "(null)"
# convert true missings from the "(null)" sentinel to proper NA
sqf_data <- sqf_data %>%
  mutate(across(all_of(null_1), ~ na_if(.x, "(null)")))
# post-clean check: "(null)" should now appear as NA
print(unique(sqf_data$ask_for_consent_flg))
## [1] "N" "Y" NA
Now, we compute the percentage of actual missing values, correctly identified
as NA:
# get % of NAs per column, in columns with at least one NA.
# fix: the original computed colMeans(is.na(sqf_data)) twice; compute once.
# (keep the name na_cols -- it is reused below to drop high-missingness columns)
pct_na <- colMeans(is.na(sqf_data)) * 100
na_cols <- pct_na[pct_na > 0]
# make df for plot
na_cols_df <- data.frame(Feature = names(na_cols), Percentage = na_cols)
# order features by percentage so the flipped bars read smallest-to-largest
na_cols_df$Feature <- factor(
  na_cols_df$Feature,
  levels = na_cols_df$Feature[order(na_cols_df$Percentage)]
)
# horizontal bar chart of the NA share in each affected column
ggplot(na_cols_df, aes(x = Feature, y = Percentage)) +
  geom_col(fill = "#F8566D", color = "black") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Percentage of NA Values per Column",
       x = "Columns",
       y = "Percentage of NA Values")
Given the above, we drop columns with more than 25% missing values, and then
drop all remaining rows that contain any NA:
# drop columns where more than a quarter of the values are missing
sqf_data <- sqf_data %>%
  select(-all_of(names(which(na_cols > 25))))
dim(sqf_data)
## [1] 16971 63
# drop every remaining row that contains at least one NA
sqf_data <- sqf_data %>%
  drop_na()
dim(sqf_data)
## [1] 12095 63
We change the coding of binary variables from "Y" and "N" to "1" and "0":
# pre check
print(unique(sqf_data$frisked_flag))
## [1] "Y" "N"
# recode strictly-binary Y/N columns as "1"/"0"
# (note: the recoded values remain character, as the post check below shows)
sqf_data <- sqf_data %>%
  mutate(across(where(~ all(.x %in% c("Y", "N"))),
                ~ if_else(.x == "Y", "1", "0")))
# post check
print(unique(sqf_data$frisked_flag))
## [1] "1" "0"
We extract the hour of the day from stop_frisk_time:
# pull the leading two-digit hour out of the HH:MM:SS stop time, as a factor
sqf_data <- sqf_data %>%
  mutate(stop_hour = factor(str_extract(stop_frisk_time, "^\\d{2}")))
print(unique(sqf_data$stop_hour))
## [1] 00 04 05 18 19 15 20 16 09 02 23 21 11 03 01 14 12 17 07 22 13 10 08 06
## 24 Levels: 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 ... 23
# stop_frisk_time is now redundant given stop_hour, so drop it
sqf_data <- sqf_data %>%
  select(-stop_frisk_time)
We also convert other relevant variables to factors as needed:
# convert categorical variables to factors.
# bug fix: the original across() call listed the columns but supplied no
# function, so nothing was actually converted (the later head() output still
# showed month2 etc. as <chr>); as.factor is now applied to each column.
sqf_data <- sqf_data %>%
  mutate(across(c(month2, day2, stop_was_initiated, issuing_officer_rank,
                  supervising_officer_rank, suspected_crime_description,
                  suspect_sex, suspect_race_description,
                  suspect_body_build_type, suspect_eye_color,
                  suspect_hair_color),
                as.factor))
# note delete eye color etc here if not using for prediction (drop to begin with)
We also make height and age numeric:
# coerce the physical-description fields to numeric
sqf_data <- sqf_data %>%
  mutate(across(c(suspect_reported_age, suspect_height, suspect_weight),
                as.numeric))
Here we explore some basic summary statistics for the entire dataset.
# some overall basics
head(sqf_data)
## # A tibble: 6 Ć 63
## stop_id year2 month2 day2 stop_was_initiated issuing_officer_rank
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 1 2023 January Sunday Based on Radio Run POM
## 2 2 2023 January Sunday Based on Self Initiated POM
## 3 4 2023 January Sunday Based on Self Initiated POM
## 4 5 2023 January Sunday Based on Self Initiated POF
## 5 6 2023 January Sunday Based on Radio Run POF
## 6 7 2023 January Sunday Based on Self Initiated POM
## # ā¹ 57 more variables: issuing_officer_command_code <chr>,
## # supervising_officer_rank <chr>, supervising_officer_command_code <chr>,
## # observed_duration_minutes <dbl>, suspected_crime_description <chr>,
## # stop_duration_minutes <dbl>, officer_explained_stop_flag <chr>,
## # other_person_stopped_flag <chr>, suspect_arrested_flag <chr>,
## # summons_issued_flag <chr>, officer_in_uniform_flag <chr>,
## # frisked_flag <chr>, searched_flag <chr>, ask_for_consent_flg <chr>, ā¦
# distribution of suspect sex (horizontal bars)
ggplot(sqf_data, aes(x = suspect_sex)) +
  geom_bar() +
  coord_flip() +
  theme_minimal() +
  labs(title = "Distribution of Suspect Sex",
       x = "Sex",
       y = "Count")
# distribution of suspect race (horizontal bars)
# bug fix: the x-axis label previously said "Sex" (copy-paste from the sex plot)
ggplot(sqf_data, aes(x = suspect_race_description)) +
  geom_bar() +
  labs(title = "Distribution of Suspect Race",
       x = "Race",
       y = "Count") +
  coord_flip() +
  theme_minimal()
# age
# height
# weight
# time of stop
# suspected crime
# search, frisk, arrest
# then do for Y and X of interest, grouped tables, hists, scatters, corr plots
# check for any outliers and transform as needed
# generate features as needed
# may need to do this before/after spatial viz based on subsetting, generation etc
Next, we explore specific summary statistic for outcome of interest -
weapon_found_flag.
# subset to stops where the suspected crime is CPW
# (criminal possession of a weapon) -- the universe for the weapon-found outcome
cpw_stops <- sqf_data %>%
filter(suspected_crime_description == "CPW")
dim(cpw_stops)
## [1] 6222 63
# tabulate weapon-found outcomes among CPW stops, with shares of the subset
cpw_stops %>%
  count(weapon_found_flag, name = "N") %>%
  mutate(Pc = N / sum(N) * 100) %>%
  arrange(desc(N)) %>%
  kable(booktabs = TRUE, col.names = c("Weapon Found on Suspect", "N CPW Stops", "% Total CPW Stops"), align = "l")
| Weapon Found on Suspect | N CPW Stops | % Total CPW Stops |
|---|---|---|
| 0 | 4707 | 75.65092 |
| 1 | 1515 | 24.34908 |
# stacked bars: weapon-found outcome by suspect race (raw counts)
ggplot(cpw_stops, aes(x = suspect_race_description, fill = weapon_found_flag)) +
  geom_bar() +
  scale_fill_brewer(type = "qual", palette = "Spectral", name = "Weapon Found Flag") +
  labs(title = "Weapon Found in CPW Stops, By Race") +
  xlab("Suspect Race") +
  ylab("N Observations") +
  coord_flip() +
  theme_minimal()
# bar plot 100% -- each race's bar normalized to total 1
# bug fix: with position = "fill" the y axis shows proportions (0-1),
# so it must not be labeled "N Observations"
ggplot(data = cpw_stops, aes(x = suspect_race_description, fill = weapon_found_flag)) +
  geom_bar(position = "fill") +
  coord_flip() +
  theme_minimal() +
  xlab("Suspect Race") +
  ylab("Proportion of Observations") +
  scale_fill_brewer(type = "qual", palette = "Spectral", name = "Weapon Found Flag") +
  labs(title = "Weapon Found in CPW Stops, By Race")
dim(sqf_data)
## [1] 12095 63
# drop 7 observations which have incorrect spatial info
# (non-positive x coordinates cannot lie on the EPSG:2263 New York grid)
sqf_data <- sqf_data %>%
filter(stop_location_x > 0)
dim(sqf_data)
## [1] 12088 63
# make spatial object: convert stops to sf points using the projected
# x/y columns (this consumes stop_location_x / stop_location_y)
sqf_data_sf <- st_as_sf(sqf_data,
coords = c("stop_location_x", "stop_location_y"),
crs = 2263) # CRS for New York Long Island, US survey feet (EPSG:2263)
# load in precinct-level shapefile
# NOTE(review): installing from GitHub at render time needs network access;
# consider pre-installing nycgeo in the project environment instead
remotes::install_github("mfherman/nycgeo")
## Skipping install of 'nycgeo' from a github remote, the SHA1 (4fee55c1) has not changed since last install.
## Use `force = TRUE` to force installation
library(nycgeo)
# police-precinct boundary polygons for NYC
nypd_shp <- nycgeo::nyc_boundaries(geography = "police")
# check crs is also EPSG 2263 (must match sqf_data_sf so layers align)
st_crs(nypd_shp)
## old-style crs object detected; please recreate object with a recent sf::st_crs()
## Coordinate Reference System:
## User input: EPSG:2263
## wkt:
## PROJCRS["NAD83 / New York Long Island (ftUS)",
## BASEGEOGCRS["NAD83",
## DATUM["North American Datum 1983",
## ELLIPSOID["GRS 1980",6378137,298.257222101,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## ID["EPSG",4269]],
## CONVERSION["SPCS83 New York Long Island zone (US Survey feet)",
## METHOD["Lambert Conic Conformal (2SP)",
## ID["EPSG",9802]],
## PARAMETER["Latitude of false origin",40.1666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8821]],
## PARAMETER["Longitude of false origin",-74,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8822]],
## PARAMETER["Latitude of 1st standard parallel",41.0333333333333,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8823]],
## PARAMETER["Latitude of 2nd standard parallel",40.6666666666667,
## ANGLEUNIT["degree",0.0174532925199433],
## ID["EPSG",8824]],
## PARAMETER["Easting at false origin",984250,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8826]],
## PARAMETER["Northing at false origin",0,
## LENGTHUNIT["US survey foot",0.304800609601219],
## ID["EPSG",8827]]],
## CS[Cartesian,2],
## AXIS["easting (X)",east,
## ORDER[1],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## AXIS["northing (Y)",north,
## ORDER[2],
## LENGTHUNIT["US survey foot",0.304800609601219]],
## USAGE[
## SCOPE["Engineering survey, topographic mapping."],
## AREA["United States (USA) - New York - counties of Bronx; Kings; Nassau; New York; Queens; Richmond; Suffolk."],
## BBOX[40.47,-74.26,41.3,-71.8]],
## ID["EPSG",2263]]
# plot stop data by weapon found
# bug fix: weapon_found_flag has factor levels "0" then "1", so the first
# value/label pair applies to the NO-weapon case; the original legend had
# "Weapon Found" attached to level "0". Labels are corrected and the colors
# swapped so that red still marks stops where a weapon was found.
ggplot() +
  geom_sf(data = nypd_shp, fill = "lightblue", color = "black", size = 0.3) +
  geom_sf(data = sqf_data_sf, aes(color = as.factor(weapon_found_flag)), size = 0.7, alpha = 0.4) +
  scale_color_manual(values = c("seagreen", "red"),
                     labels = c("No Weapon Found", "Weapon Found")) +
  theme_minimal() +
  labs(title = "NYC Police Stops with Weapon Found Flag",
       subtitle = "Stops plotted on NYC precinct boundaries") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.title = element_blank())
## old-style crs object detected; please recreate object with a recent sf::st_crs()
## old-style crs object detected; please recreate object with a recent sf::st_crs()
# plot stop data by arrest status
# (comment fixed: this maps arrests, not suspected crime)
# bug fix: suspect_arrested_flag has factor levels "0" then "1", so the first
# value/label pair applies to the NOT-arrested case; the original legend had
# "Arrested" attached to level "0". Labels are corrected and the colors
# swapped so that red still marks stops ending in arrest.
ggplot() +
  geom_sf(data = nypd_shp, fill = "lightblue", color = "black", size = 0.3) +
  geom_sf(data = sqf_data_sf, aes(color = as.factor(suspect_arrested_flag)), size = 0.7) +
  scale_color_manual(values = c("seagreen", "red"),
                     labels = c("Not Arrested", "Arrested")) +
  theme_minimal() +
  labs(title = "NYC Police Stops by Arrest Status",
       subtitle = "Stops plotted on NYC precinct boundaries") +
  theme(plot.title = element_text(hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.title = element_blank())
## old-style crs object detected; please recreate object with a recent sf::st_crs()
## old-style crs object detected; please recreate object with a recent sf::st_crs()
# can add acs pop here and check stats on this also
# zoom in on manhattan for example
We first run the following model: insert formula
# set seed for reproducibility of the train/test split and CV folds
set.seed(1)
# TODO: split sample into train and test sets
# TODO: generate train and test y
# TODO: generate train and test X
# candidate regressors for the first specification (to be filled in)
regressors_1 <- c()
# train_x <- model.matrix(~ . - 1, data = train %>% select(all_of(regressors_1)))
# test_x <- model.matrix(~ . - 1, data = test %>% select(all_of(regressors_1)))
#
# dim(train_x)
# dim(test_x)
#
# # run cv lasso
# lasso_arrest <- cv.glmnet(x = train_x, y = train_arrest, alpha = 1, nfolds = 10, family = "binomial")
#
# # get value of lambda that minimises cv mse in training sample
# coef(lasso_arrest, s = "lambda.min")
#
# # plot regression MSEs against log lambda
# plot(lasso_arrest)
#
# # plot coefficients against lambda
# plot(lasso_arrest$glmnet.fit, xvar = "lambda", label = TRUE)
# # get predicted values for this lambda with test data
# lasso_arrest_pred_test <- predict(lasso_arrest, s = lasso_arrest$lambda.min, newx = test_x,type = "response")
#
# mse_test <- mean((lasso_arrest_pred_test - test_arrest)^2)
#
# print(mse_test)
# rocs a